%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Matlab code for estimation of person and establishment effects.
%  This is closely based on code created by Card, Heining, and Kline (2013).
clear
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%      DIRECTORIES      %%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
datadir="<datadir>"; %data directory
outputdir="<ouputdir>"; %output directory
path(path,["<path>"]); %path to the matlabBGL files 
%note: matlab BGL obtained from http://www.mathworks.com/matlabcentral/fileexchange/10922
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%      PARAMETERS      %%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
samplesize='100P';                          %Sample size options: '10P' or '100P 
minhours=2000;                               %Earnings threshold: the number of hours worked at federal minimum wage 520 or 2080
minwage=7.25;                               %Using 2013 min wage 
labsuprest='';                              %Labor supply restriction or not: either leave blank for put "_Sandwich"
gender = 'Male';                            %Select the gender: "Male" or "Female"
popage = '20-60';                           %Select the age range (anywhere between 18 and 65)

if minhours==520
    a='1';
elseif minhours==2000
    a='2';
end 
if strcmp(labsuprest,'')==1
    b='0';
elseif strcmp(labsuprest,'_Sandwich')==1
    b='1';
end
spec=['' a '_' b '_' int2str(minhours) 'hr' labsuprest ''];    %Estimation specification. Use 1 for 520 hrs, 2 for 1500 hrs.
intlgth=7;                                  %Length of intervals
intstep=1;                                  %Interval step in loop (for _Sandwich can set this to 4, only first and last)
intend=5;                                   %Interval end in loop
norm1=40;                                   %a parameter for the age normalization
norm2=40;                                   %a parameter for the age normalization
normtext=['age=(age-' int2str(norm1) ')/' int2str(norm2) ''];
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

s=['' outputdir '/logs/S' samplesize '_AKMest_' gender '_Int' int2str(intlgth) '_' spec '.log'];
diary(s)

disp('*****************************************************');
disp('*                                                   *');
s=['Sample and Spec: ' spec ', ' popage ', ' gender ', ' normtext ''];
disp(s);
disp('*                                                   *');
disp('*****************************************************');

for i=1:intstep:intend
s=['Loading data for interval' int2str(intlgth) '_' int2str(i) '...'];
disp(s);
%LOAD DATA 
s=['' datadir '/S' samplesize '_' gender '_Int' int2str(intlgth) '_' int2str(i) '.txt'];
data=csvread(s);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%     PCE DEFLATE, APPLY MIN EARNINGS THRESHOLD      %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%input pce deflator       
pce = [1980 2.446041202; 1981 2.245386992; 1982 2.127905367; 1983 2.040091790; 1984 1.965718880;
       1985 1.898519264; 1986 1.858310156; 1987 1.803386421; 1988 1.735760157; 1989 1.664145047;
       1990 1.595077106; 1991 1.544420835; 1992 1.504629759; 1993 1.467978548; 1994 1.438070666;
       1995 1.408821835; 1996 1.379464228; 1997 1.356057837; 1998 1.345726581; 1999 1.326248305;
       2000 1.294005846; 2001 1.269495846; 2002 1.252687108; 2003 1.228383502; 2004 1.199201810;
       2005 1.165953111; 2006 1.135576223; 2007 1.107836171; 2008 1.075021236; 2009 1.075720000;
       2010 1.058227499; 2011 1.032866374; 2012 1.013673071; 2013 1.000000000];
%merge into data set    
[~,ii] = ismember(data(:,2),pce(:,1));
data = [data, pce(ii,2)];
%adjust for inflation
data(:,4) =data(:,4).*data(:,10);
%keep only those above earnings threshold
data = data(logical(data(:,4)>=minhours*minwage),:);
data = sortrows(data,[1 2]);

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%     OPTIONAL: LABOR SUPPLY RESTRICTIONS            %
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
disp('If stable job specification 1 otherwise 0');
disp(strcmp(labsuprest,'_Sandwich'));
if strcmp(labsuprest,'_Sandwich')==1 
    %keep only those with sandwiched jobs
    data = data(logical(data(:,9)==1),:);
end

%create new lag firm id
id=data(:,1);
lagid = [-9; data(1:end-1,1)];
lagfirmid = [-9; data(1:end-1,3)];
lagfirmid(id~=lagid)=-9;
data(:,11)=lagfirmid;
data=data(:,[1 2 3 11 4 5 6 7 8 9]);
clear id lagid lagfirmid;

id=data(:,1);
year=data(:,2);
firmid=data(:,3);
lagfirmid=data(:,4);
y=log(data(:,5));
birth=data(:,6);
indcode=data(:,7);
geocode=data(:,8);
n_jobs=data(:,9);
i_sw=data(:,10);
clear data

firmid_old=firmid;
id_old=id;
lagfirmid(lagfirmid==-9)=NaN;

%RENAME
disp('Relabeling ids...')
N=length(y);
sel=~isnan(lagfirmid);

%relabel the firms
[firms,m2,n]=unique([firmid;lagfirmid(sel)]);

firmid=n(1:N);
lagfirmid(sel)=n(N+1:end);


%relabel the workers
[ids,m2,n]=unique(id);
id=n;

%initial descriptive stats
fprintf('\n')
disp('Some descriptive stats:')
s=['# of p-y obs: ' int2str(length(y))];
disp(s);
s=['# of workers: ' int2str(max(id))];
disp(s);
s=['# of firms: ' int2str(max(firmid))];
disp(s);
s=['mean wage: ' num2str(mean(y))];
disp(s)
s=['variance of wage: ' num2str(var(y))];
disp(s)
fprintf('\n')


if  i == 1
    COL3='D10';
elseif i == 2
    COL3='D13';
elseif i == 3
    COL3='D16';
elseif i == 4
    COL3='D19';
elseif i == 5
    COL3='D22';
end

out3=[length(y), max(id), max(firmid), mean(y), median(y), std(y)];

filename=['' outputdir '/Tables/S' samplesize '_' spec '_Table_2.xlsx'];
s3=['Table_2_' gender ''];
xlswrite(filename,out3,s3,COL3);

%FIND CONNECTED SET
disp('Finding connected set...')
A=sparse(lagfirmid(sel),firmid(sel),1); %adjacency matrix
%make it square
[m,n]=size(A);
if m>n
    A=[A,zeros(m,m-n)];
end
if m<n
    A=[A;zeros(n-m,n)];
end
A=max(A,A'); %connections are undirected

[sindex, sz]=components(A); %get connected sets
idx=find(sz==max(sz)); %find largest set
s=['# of firms: ' int2str(length(A))];
disp(s);
s=['# connected sets:' int2str(length(sz))];
disp(s);
s=['Largest connected set contains ' int2str(max(sz)) ' firms'];
disp(s);
fprintf('\n')
clear A lagfirmid
firmlst=find(sindex==idx); %firms in connected set
sel=ismember(firmid,firmlst);

y=y(sel); firmid=firmid(sel); id=id(sel);
year=year(sel); id_old=id_old(sel); firmid_old=firmid_old(sel);
birth=birth(sel); 
indcode=indcode(sel);geocode=geocode(sel);n_jobs=n_jobs(sel);i_sw=i_sw(sel);
%fsize=fsize(sel); 
N=length(y); 

disp('Relabeling ids again...')
%relabel the firms
[firms,m2,n]=unique(firmid);
firmid=n;

%relabel the workers
[ids,m2,n]=unique(id);
id=n;

%descriptive stats for connected set
fprintf('\n')
disp('Now restricted to the largest connected set');
s=['# of p-y obs: ' int2str(length(y))];
disp(s);
s=['# of workers: ' int2str(max(id))];
disp(s);
s=['# of firms: ' int2str(max(firmid))];
disp(s);

s=['mean wage: ' num2str(mean(y))];
disp(s)
s=['variance of wage: ' num2str(var(y))];
disp(s)
fprintf('\n')

%ESTIMATE AKM
disp('Building matrices...')
NT=length(y);
N=max(id);
J=max(firmid);

D=sparse(1:NT,id',1);
F=sparse(1:NT,firmid',1);

S=speye(J-1);
S=[S;sparse(-zeros(1,J-1))];  %N+JxN+J-1 restriction matrix 

%Build time trends
yrmin=min(year); yrmax=max(year);

R=sparse(1:NT,(year-yrmin+1),1); %year effects
R(:,1)=[]; %drop first year effect

age=year-birth;
age=(age-norm1)/norm2; %rescale to avoid big numbers

A=[age.^2,age.^3]; %age cubic
Z=[R,A];

clear R A

X=[D,F*S,Z];

disp('Running AKM...')
tic
xx=X'*X;
xy=X'*y;
L=ichol(xx,struct('type','ict','droptol',1e-2,'diagcomp',.1));
b=pcg(xx,xy,1e-10,1000,L,L');
toc
disp('Done')
clear xx xy L

%ANALYZE RESULTS
xb=X*b;
r=y-xb;

disp('Goodness of fit:')
dof=NT-J-N+1-size(Z,2)
RMSE=sqrt(sum(r.^2)/dof)

TSS=sum((y-mean(y)).^2);
R2=1-sum(r.^2)/TSS
adjR2=1-sum(r.^2)/TSS*(NT-1)/dof

ahat=b(1:N);
ghat=b(N+1:N+J-1);
bhat=b(N+J:end);
disp('check for problems with year effects. should report zero')
sum(bhat==0)

pe=D*ahat;
fe=F*S*ghat;
xb=X(:,N+J:end)*bhat;

clear D F X b

disp('Variance-Covariance of worker and firm effs (p-y weighted)');
cov(pe,fe)
disp('Correlation coefficient');
corr(pe,fe)
disp('Means of person/firm effs')
mean([pe,fe])

disp('Full Covariance Matrix of Components')
disp('    y      pe      fe      xb      r')
C=cov([y,pe,fe,xb,r])

disp('Decomposition #1')
disp('var(y) = cov(pe,y) + cov(fe,y) + cov(xb,y) + cov(r,y)');
c11=C(1,1); c21=C(2,1); c31=C(3,1); c41=C(4,1); c51=C(5,1);
s=[num2str(c11) ' = ' num2str(c21) ' + ' num2str(c31) ' + ' num2str(c41) ' + ' num2str(c51)];
disp(s)
fprintf('\n')
disp('explained shares:    pe       fe       xb       r')
s=['explained shares: ' num2str(c21/c11) '  ' num2str(c31/c11) '  ' num2str(c41/c11) '  ' num2str(c51/c11)];
disp(s)

fprintf('\n')
disp('Decomposition #2')
disp('var(y) = var(pe) + var(fe) + var(xb) + 2*cov(pe,fe) + 2*cov(pe,xb) + 2*cov(fe,xb) + var(r)');
c11=C(1,1); c22=C(2,2); c33=C(3,3); c44=C(4,4); c55=C(5,5); 
c23=C(2,3); c24=C(2,4); c34=C(3,4);
s=[num2str(c11) ' = ' num2str(c22) ' + ' num2str(c33) ' + ' num2str(c44) ' + '  num2str(2*c23) ' + ' num2str(2*c24) ' + ' num2str(2*c34) ' + ' num2str(c55)];
disp(s)
fprintf('\n')
disp('explained shares:    pe      fe      xb   cov(pe,fe)   cov(pe,xb)   cov(fe,xb)   r')
s=['explained shares: ' num2str(c22/c11) '  ' num2str(c33/c11) '  ' num2str(c44/c11) '  ' num2str(2*c23/c11) '  ' num2str(2*c24/c11) '  ' num2str(2*c34/c11) '  ' num2str(c55/c11)];
disp(s)
fprintf('\n')

%joint distribution and separability
fedec = ceil(10 * tiedrank(fe) / length(fe));
pedec = ceil(10 * tiedrank(pe) / length(pe));
for j=1:10
    for k=1:10
        p(j,k)=mean((pedec==j)&(fedec==k));
        meanr(j,k)=mean(r.*(pedec==j).*(fedec==k))/p(j,k);
    end
end
disp('Joint distribution of effects (rows are deciles of pe, cols are deciles of fe)')
p
disp('Mean residual by pe/fe decile')
meanr

clear fedec pedec

%match effects
disp('Match Effects Model')
dig=ceil(max(log10(firmid)));
firmiddec=firmid./(10^dig);
matchid=id+firmiddec;
[matchnew,m2,n]=unique(matchid);
matchid=n;

M=sparse(1:NT,matchid',1);
X=[M,Z];
xx=X'*X;
xy=X'*y;
L=ichol(xx,struct('type','ict','droptol',1e-2,'diagcomp',.1));
b=pcg(xx,xy,1e-10,1000,L,L');
r_match=y-X*b;
dof_match=NT-size(X,2)
RMSE_match=sqrt(sum(r_match.^2)/dof_match)
R2_match=1-sum(r_match.^2)/TSS
adjR2_match=1-sum(r_match.^2)/TSS*(NT-1)/dof_match
clear Z r_match b

%further decomposition
fprintf('\n')
disp('Further Decompositions:')
disp('Decomposing residual into match and transitory component')

xx=M'*M;
xy=M'*r;
m=M*(xx\xy);
e=r-m;
disp('Full Covariance Matrix of Components')
disp('    y      pe      fe      xb      m      e')
C=cov([y,pe,fe,xb,m,e])

clear M

%even further decomposition
disp('Decomposing transitory component into firm/year and person component')
F2=sparse(1:NT,(year-yrmin+1)*J+firmid,1);
F2=F2(:,any(F2,1));
xx=F2'*F2;
xy=F2'*e;
L=ichol(xx,struct('type','ict','droptol',1e-2));
bf=pcg(xx,xy,1e-10,1000,L,L');
f=F2*bf;
e2=e-f;
disp('Full Covariance Matrix of Components')
disp('    y      pe      fe      xb      m      f      e2')
C=cov([y,pe,fe,xb,m,f,e2])

clear F2 xx xy

%SAVE OUTPUT (this will write to the current directory)
disp('Saving main effects')

out=[id_old firmid_old year pe fe xb r y birth indcode geocode n_jobs i_sw];
s=['' outputdir '/output_data/S' samplesize '_AKMeffs_' spec '_' gender '_Int' int2str(intlgth) '_' int2str(i) '.txt'];
dlmwrite(s, out, 'delimiter', '\t', 'precision', 16);

%save bhats
disp('Computing/saving bhat crosswalk')
age=year-birth;

groupid=age+year*100; %unique combinations of age and year
[idnew,m2,n]=unique(groupid);
G=sparse(1:NT,n,1);
xx=G'*G;
L=ichol(xx,struct('type','ict','droptol',1e-2));
xy=G'*age;
bage=pcg(xx,xy,1e-10,1000,L,L');
xy=G'*year;
byear=pcg(xx,xy,1e-10,1000,L,L');
xy=G'*xb;
bxb=pcg(xx,xy,1e-10,1000,L,L');

out=[round([bage byear]) bxb];
s=['' outputdir '/bhat4e.txt'];
%dlmwrite(s, out, 'delimiter', '\t', 'precision', 16);

if  i == 1
    COL1='C7';
    COL2='C9';
    COL4='K10';
elseif i == 2
    COL1='D7';
    COL2='E9';
    COL4='K13';
elseif i == 3
    COL1='E7';
    COL2='G9';
    COL4='K16';
elseif i == 4
    COL1='F7';
    COL2='I9';
    COL4='K19';
elseif i == 5
    COL1='G7';
    COL2='K9';
    COL4='K22';
end

out1=[ N ; J; NT; std(y); NaN; std(pe) ; std(fe); std(xb); corr(pe,fe) ; corr(pe,xb); corr(fe,xb); RMSE ; adjR2; NaN; RMSE_match ; adjR2_match; std(m)];

covpefe=2*cov(pe,fe);
covpexb=2*cov(pe,xb);
covfexb=2*cov(fe,xb);
covpefe=covpefe(2,1);
covpexb=covpexb(2,1);
covfexb=covfexb(2,1);

out2=[var(y); NaN; var(pe) ; var(fe); var(xb) ; var(r) ; covpefe ; covpexb; covfexb];

filename=['' outputdir '/Tables/S' samplesize '_' spec '_Table_3_4.xlsx'];
s1=['Table_3_' gender ''];
s2=['Table_4_' gender ''];
xlswrite(filename,out1,s1,COL1);
xlswrite(filename,out2,s2,COL2);

out4=[NT, N, J, mean(y), median(y), std(y)];

filename=['' outputdir '/Tables/S' samplesize '_' spec '_Table_2.xlsx'];
s4=['Table_2_' gender ''];
xlswrite(filename,out4,s4,COL4);

clearvars -except datadir outputdir samplesize specnumb spec minhours intlgth intstep intend gender popage norm1 norm2 normtext i minwage labsuprest;

end

diary off;

